import urllib.request
import os
# Download the Titanic passenger spreadsheet once and cache it on disk.
url="http://biostat.mc.vanderbilt.edu/wiki/pub/Main/DataSets/titanic3.xls"
filepath="/Users/hannah/titanic3.xls"
# Only fetch when no cached copy exists. Both statements belong to the `if`
# body: `result` is defined only when a download actually happened, so the
# print must not run on the cached path.
if not os.path.isfile(filepath):
    result = urllib.request.urlretrieve(url, filepath)
    print('downloaded:', result)
output:
downloaded: ('/Users/hannah/titanic3.xls', <http.client.HTTPMessage object at 0x10a450400>)
import numpy
import pandas as pd
# Read the cached spreadsheet into a DataFrame, then preview its first two rows.
all_df = pd.read_excel(io=filepath)
all_df[0:2]
output:
pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body	home.dest
0	1	1	Allen, Miss. Elisabeth Walton	female	29.0000	0	0	24160	211.3375	B5	S	2	NaN	St Louis, MO
1	1	1	Allison, Master. Hudson Trevor	male	0.9167	1	2	113781	151.5500	C22 C26	S	11	NaN	Montreal, PQ / Chesterville, ON
# Keep only the label ('survived') and the attributes used later, label first.
cols = [
    'survived', 'name', 'pclass', 'sex', 'age',
    'sibsp', 'parch', 'fare', 'embarked',
]
all_df = all_df.loc[:, cols]
# Preview the first two rows of the narrowed frame.
all_df[0:2]
output:
survived	name	pclass	sex	age	sibsp	parch	fare	embarked
0	1	Allen, Miss. Elisabeth Walton	1	female	29.0000	0	0	211.3375	S
1	1	Allison, Master. Hudson Trevor	1	male	0.9167	1	2	151.5500	S
# Build the working frame without the 'name' column; all_df itself is unchanged.
df = all_df.drop(columns=['name'])
# Count missing values per column. This is computed on all_df, so 'name'
# still appears in the report.
all_df.isnull().sum()
output:
survived      0
name          0
pclass        0
sex           0
age         263
sibsp         0
parch         0
fare          1
embarked      2
dtype: int64
# Replace missing ages with the mean age.
age_mean = df['age'].mean()
df['age'] = df['age'].fillna(age_mean)
# Replace missing fares with the mean fare. The mean gets its own name so
# that age_mean keeps holding the age mean instead of being overwritten.
fare_mean = df['fare'].mean()
df['fare'] = df['fare'].fillna(fare_mean)
# Encode sex as an integer: female -> 0, male -> 1.
df['sex'] = df['sex'].map({'female': 0, 'male': 1}).astype(int)
# One-hot encode the port of embarkation. dtype=int pins the indicator
# columns to 0/1 integers: recent pandas releases default to bool
# indicators, which would no longer match the 0/1 values recorded for this
# step.
x_OneHot_df = pd.get_dummies(data=df, columns=["embarked"], dtype=int)
# Preview the first two encoded rows.
x_OneHot_df[:2]
output:
survived	pclass	sex	age	sibsp	parch	fare	embarked_C	embarked_Q	embarked_S
0	1	1	0	29.0000	0	0	211.3375	0	0	1
1	1	1	1	0.9167	1	2	151.5500	0	0	1
# Convert the encoded frame to a NumPy array and inspect its (rows, columns) shape.
ndarray = x_OneHot_df.to_numpy()
ndarray.shape
output:
(1309, 10)
# Show the first two rows of the converted array.
ndarray[0:2]
output:
array([[  1.    ,   1.    ,   0.    ,  29.    ,   0.    ,   0.    ,
211.3375,   0.    ,   0.    ,   1.    ],
[  1.    ,   1.    ,   1.    ,   0.9167,   1.    ,   2.    ,
151.55  ,   0.    ,   0.    ,   1.    ]])
# Split the array into the label (column 0, 'survived') and the features
# (every remaining column).
# `ndarray[:0]` slices zero ROWS and produces an empty (0, 10) array; picking
# the first COLUMN of every row requires `[:, 0]`.
Label = ndarray[:, 0]
Features = ndarray[:, 1:]
# Preview the first two labels. With the column index fixed this yields the
# 'survived' values of the first two rows (1.0 and 1.0 in the rows shown
# earlier) rather than the empty array the original slice returned.
Label[:2]
output:
array([], shape=(0, 10), dtype=float64)